* Directory roots used by every section of this do-file.
* Each path is derived from the one above it, so relocating the project
* only requires editing projectdir.
global projectdir    "~"
global datadir       "${projectdir}/data"
global intermed      "${datadir}/intermediate_files"
global alex_transfer "${datadir}/transfer"

* Do not pause long output at the -more- prompt.
set more off

*******SETUP*******


*** Create PI variable
* Output files:
*   $intermed/pi_award : one row per unique_award holding the pinumber of
*                        the top-ranked faculty employee on that award
*   $intermed/pi       : one row per employee (pik) and year holding the
*                        pinumber with the largest summed proportion

use "$datadir/raw_pulls/umetrics/FSRDC_2018/core_employee2019q1_pik.dta", clear
gen byte faculty = inlist(umetrics_occ, "Faculty", "faculty")
ren institution_id submit_university

* Force proportion into [0,1]; missing proportions are left missing.
replace proportion = 0 if proportion < 0
replace proportion = 1 if proportion > 1 & proportion < .

* Total proportion charged as faculty by each employee on each award.
* egen total() is the documented name of the older sum() used previously.
bysort unique_award emp_number: egen fac_share = total(faculty*proportion)

* Rank rows within award. emp_number and -faculty are appended as final
* tie-breakers: without them rows tied on every other key are ordered
* arbitrarily by the sort, so the selected PI could differ between runs.
gsort unique_award -fac_share fte_status period_end_date period_start_date -proportion emp_number -faculty
by unique_award: gen award_rank = _n

* Award -> PI lookup: keep the top-ranked row only when it is a faculty row.
preserve
keep if award_rank == 1 & faculty == 1
egen pinumber = group(emp_number)
keep unique_award pinumber
save "$intermed/pi_award", replace
restore

merge m:1 unique_award using "$intermed/pi_award", keep(1 3) nogen

* Drop rows on awards without an identified PI. The variable name is
* spelled out; the earlier abbreviation (pi) fails under -set varabbrev off-
* or as soon as another variable starting with pi exists.
drop if missing(pinumber)

* NOTE(review): period_end is resolved by variable abbreviation, presumably
* to period_end_date -- confirm and spell out.
gen year_end = year(period_end)

* For each employee-year keep the PI with the largest summed proportion.
* pinumber is the final sort key so that ties are broken reproducibly.
bysort emp_number year_end pinumber: egen share_by_pi = total(proportion)
gsort emp_number year_end -share_by_pi pinumber
by emp_number year_end: gen rank_in_year = _n

keep if rank_in_year == 1
ren year_end year
keep emp_number pinumber year
ren emp_number pik

save "$intermed/pi", replace
* This is a pik-year level dataset with exactly one pinumber per pik-year;
* a pik can still be associated with different pinumbers in different years.


*** 4 Merge PI variable to main dataset
* Attaches pinumber to the job-history file by pik-year and builds
* pinumber_infill, which spreads a known PI to the same pik's other rows.
u "$intermed/pi", clear

* keep(2 3): drop PI rows that have no job-history record.
merge 1:m pik year using "$datadir/jobhist_lehd_E.dta", keep(2 3) nogen
replace pinumber = 0 if pinumber == .

* Remember the pik-year row order so the backward pass below can walk it
* in exact reverse and then restore it.
sort pik year
gen long row_order = _n

* pinumber is an integer group id, so store the infilled copy as long.
gen long pinumber_infill = pinumber if pinumber != 0

* Forward fill: replace runs top to bottom, so each row sees the already
* updated value of the row above and one pass fills every gap after the
* first known PI of a pik.
by pik: replace pinumber_infill = pinumber_infill[_n-1] if missing(pinumber_infill)

* Backward fill for rows before the first known PI. The previous
* implementation repeated a one-row-at-a-time backward step 35 times, which
* left rows unfilled whenever a pik had more than 35 leading rows without a
* PI. Reversing the order makes one pass sufficient regardless of length.
gsort pik -row_order
by pik: replace pinumber_infill = pinumber_infill[_n-1] if missing(pinumber_infill)

* Restore the original pik-year order and drop the helper.
sort pik year row_order
drop row_order

* piks never linked to a PI keep the placeholder value 0.
replace pinumber_infill = pinumber if missing(pinumber_infill)

save "$datadir/jobhist_lehd_E_06102021.dta", replace


* 5 Add the new shocks and matching approach
* Treatment: people who have a temporary negative shock.
* Control: people with no large negative shocks.

****************************************
*******DEFINE NEGATIVE SHOCKS************
****************************************

* Threshold on the year-over-year log change that defines a shock year.
local cutoff -0.4
* Threshold on the log change in the two preceding years.
local cutoff2 -0.3

* Treatment group (CFDA level): a log change below the shock threshold that
*   reverts in the future, with no positive or negative shock in the years
*   before. Treated people get more than 50% of funding from a treated CFDA.
* Control group: CFDAs that never have a negative shock of that size.
* Omitted group: CFDAs with such a decline that is permanent (never returns
*   to the previous level) or that is preceded by a positive or negative shock.

use "$alex_transfer/singleaudit_cfda", clear
joinby cfda using "$alex_transfer/cfda_proportion"
drop if year < minyear
drop if year > maxyear
egen cfda_num = group(cfda)
tsset cfda_num year

gen logrd = log(amount_rd)
gen d_logrd = log(amount_rd) - log(L.amount_rd)
by cfda_num: egen mostnegativeshock = min(d_logrd)

* Require more than five observed years from 2010 onward.
* egen total() is the documented name of the older sum() used previously.
by cfda_num: egen number_year = total(year >= 2010)
drop if number_year <= 5

* Candidate shock years; a missing d_logrd compares false and yields 0.
gen byte temp_neg_shock = (d_logrd < `cutoff')

* Highest log funding over the next 1 to 10 years. max() ignores missing
* arguments. The previous comparison (new value > futuremax) could never be
* true once futuremax started out missing, so a missing one-year-ahead
* amount hid every later recovery and discarded the shock.
gen futuremax = log(F.amount_rd)
forvalues h = 2/10 {
    replace futuremax = max(futuremax, log(F`h'.amount_rd))
}

* Temporary means funding later comes back to within 0.1*cutoff (in logs)
* of the pre-shock level; shocks with no observed future are discarded.
gen diff = futuremax - log(L.amount_rd)
replace temp_neg_shock = 0 if diff < 0.1*`cutoff' | diff == .

* Discard shocks preceded by a large positive change in either of the two
* prior years.
* NOTE(review): the first condition is also true when L.d_logrd is missing
* (missing compares greater than any number), while the L2 condition is
* guarded against missing. Confirm that this asymmetry is intended.
replace temp_neg_shock = 0 if L.d_logrd > -(`cutoff2') | (L2.d_logrd > -(`cutoff2') & L2.d_logrd < .)
* Discard shocks preceded by a large negative change in either prior year.
replace temp_neg_shock = 0 if L.d_logrd < `cutoff2' | L2.d_logrd < `cutoff2'

* First shock year and number of shock years per CFDA.
gen temp_year = year if temp_neg_shock == 1
by cfda_num: egen year_shock = min(temp_year)

by cfda_num: egen nshock = total(temp_neg_shock)

* Plot funding over time: event-time dummies for tau = -5..-2 and 0..5
* (tau = -1 is the omitted reference year).
gen tau = year - year_shock
forvalues k = 5(-1)2 {
    gen taum`k' = (tau == -`k')
}
forvalues k = 0/5 {
    gen tau`k' = (tau == `k')
}
reghdfe logrd taum5-tau5 if nshock==1 & tau^2<=25, absorb(cfda_num)

* Omitted group: CFDAs whose worst change is below 0.75*cutoff but that
* have no qualifying temporary shock.
drop if mostnegativeshock < 0.75*`cutoff' & nshock == 0

* One row per CFDA with its shock count and first shock year.
collapse nshock year_shock*, by(cfda)
compress
save "$intermed/cfda_shocks", replace


* Construct treatment and control groups at the employee (iris) level.
use "$alex_transfer/employee_cfda", clear

* Read the CFDA shock file from $intermed, where this do-file saves it.
* The merge previously pointed at $alex_transfer/cfda_shocks, so the file
* computed by this script was never used.
* NOTE(review): if the transferred copy is the intended input, revert this
* path and drop the save into $intermed instead.
merge m:1 cfda using "$intermed/cfda_shocks", keep(3) nogen

* Among an employee's shocked CFDAs keep only the one with the largest
* funding share; rows on unshocked CFDAs are all kept.
gen share_treatment = proportion if nshock > 0
bysort iris: egen maxshare_treatment = max(share_treatment)
drop if share_treatment != . & share_treatment != maxshare_treatment
gen share_control = proportion if nshock == 0

* One row per employee. The sum of an all-missing share is 0, which is what
* the share_treatment==0 tests below rely on.
collapse (sum) share_treatment share_control (mean) year_shock* minyear maxyear (max) nshock, by(iris)
*replace year_shock=year_shock2 if year_shock<minyear & year_shock2>minyear & year_shock2<.
*replace year_shock=year_shock3 if year_shock2<minyear & year_shock3>minyear & year_shock3<.

* Diagnostics: sizes of the prospective control and treatment groups.
count if share_control >= 0.5 & share_treatment == 0
count if share_treatment >= 0.5 & year_shock > minyear
* A lot of people are dropped because year_shock < minyear.

* Treated: at least half of funding from a shocked CFDA whose shock year is
* after minyear. Control: at least half of funding from unshocked CFDAs and
* none from shocked ones. Everyone else is dropped.
gen treated = 1 if share_treatment >= 0.5 & year_shock > minyear
replace treated = 0 if share_control >= 0.5 & share_treatment == 0
drop if treated == .
keep iris treated year_shock minyear
compress
save "$intermed/employee_shocks", replace

*** ADD SHOCKS TO MAIN DATA
u "$datadir/jobhist_lehd_E_06102021.dta", clear

* NOTE: there is no reason to keep anything but treatment and control
* observations here, as it only complicates things later, so only matched
* rows are retained.
merge m:1 iris_employee_number using "$intermed/employee_shocks", keep(3) nogen

* Add occupational categorical var and new outcome variables.
* The cond() chain is written from highest to lowest code so that, as with
* sequential replaces, a later category wins when several flags are set.
gen occup_cats = cond(occup_other_staff == 1, 4, ///
                 cond(occup_undergradstudent == 1, 3, ///
                 cond(occup_gradpostdoc == 1, 2, ///
                 cond(occup_faculty == 1, 1, .))))

** Make variables for event study
* Event time relative to the shock year; fixed at 0 for controls.
gen tau = cond(treated == 0, 0, year - year_shock)
*tab tau, gen(shock_dummies)
* Post-period indicator: 1 when tau is non-missing and above 1, else 0.
gen post = (tau != .) & (tau > 1)
save "$datadir/jobhist_lehd_E_06102021.dta", replace



*** Add entrep data ******
* Inverse-age weights: 0 when firm age is missing, 1 when it is zero or
* negative, and 1/(age+1) otherwise.
gen ht_overage = cond(missing(firmage_ht), 0, ///
                 cond(firmage_ht <= 0, 1, 1/(firmage_ht + 1)))

gen overage = cond(missing(firmage_priv), 0, ///
              cond(firmage_priv <= 0, 1, 1/(firmage_priv + 1)))

* Where a count variable is zero, take the value of its uncounted
* counterpart instead.
foreach stub in ht_mode_start_2012 start ht_mode_young_2012 {
    replace count_`stub' = `stub' if count_`stub' == 0
}

compress
save "$datadir/jobhist_lehd_E_06102021.dta", replace


** Fields & R1 Univs
* Mutually exclusive field indicators built from fieldid; anything not
* classified as science, engineering or bio/med/pharma falls into other.
gen field_science      = inlist(fieldid, 4, 5, 12, 15, 16)
gen field_engin        = (fieldid == 8) & !field_science
gen field_bioMedPharma = inlist(fieldid, 2, 13, 10) & !field_science & !field_engin
gen field_other        = !(field_science | field_engin | field_bioMedPharma)

* R1 flag: both carnegie codes equal 15. R1_if_univ is only defined for
* rows with alt_univ_ein_max==1 and is missing elsewhere.
local is_r1 "carnegie == 15 & alt_carnegie_max == 15"
gen R1_if_univ = (`is_r1') if alt_univ_ein_max == 1
gen R1         = (`is_r1')

tsset emp_num year

*In Sample
* Flag the estimation sample of the baseline specification.
reghdfe count_ht_mode_start_2012 post, absorb(i.year#i.submit_u#i.fieldid i.pinumber_infill) vce(cluster emp_num) keepsingletons
gen in_sample = e(sample)
save "$datadir/jobhist_lehd_E_06102021.dta", replace




use "$datadir/jobhist_lehd_E_06102021.dta" , clear
* Impute CFDA num so same sample
* modal_cfda2 keeps the raw code with 0 standing in for missing. It is
* stored as double so the copy matches modal_cfda exactly whatever its
* storage type (the default float can round codes with many digits).
gen double modal_cfda2 = modal_cfda
replace modal_cfda2 = 0 if modal_cfda == .

* Fill missing modal_cfda with the in-sample median so that specifications
* using it do not lose observations.
sum modal_cfda if in_sample==1, detail
replace modal_cfda = r(p50) if modal_cfda == .

* Field-by-university cell id. Stored as double: the default float cannot
* represent every integer above 16,777,216, so with the float type distinct
* field/university pairs could be rounded to the same id.
* NOTE(review): the encoding assumes submit_u < 1,000,000 -- confirm.
gen double field_univ = fieldid*1000000 + submit_u

save "$datadir/jobhist_lehd_E_06102021.dta" , replace
